This analysis on a simulated fisheries dataset aims to coarsly know starting at what added catch a catch hotspot can be detected by self-organizing maps and a clustering approach (dynamicTreeCut using Ward’s criterion).

ICCAT task 2 dataframe

Method: First load task 2 df, and group by cell ID –> then select only xLon yLat columns

#load iccat dataset and get statistical info
df <- read.xlsx("~/ETHZ/Master/Masterarbeit/TunaSOM.2/Data/t2ce_ETRO-PS1991-19_bySchool.xlsx", startRow = 7) %>% filter(FishMode == "FAD")

df.summary <- df %>% 
  group_by(YearC, TimePeriodID, xLon, yLat) %>% #group them to have total of catches per spatio-temporal cell since this is what we will simulate in the data (and not several lines per cell due to flag data)
  summarise(BET = sum(BET), YFT = sum(YFT), SKJ = sum(SKJ), Eff2 = sum(Eff2)) %>% 
  select(xLon, yLat, YearC, TimePeriodID, BET, YFT, SKJ, Eff2)

ICCATsummary <- skim(df.summary[,5:7]) %>% select(-skim_type, -numeric.hist, -complete_rate)
knitr::kable(ICCATsummary)
skim_variable n_missing numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100
BET 0 5904.540 12513.88 0 370 1720.0 6100 716252
YFT 0 7701.608 20953.44 0 688 2443.0 7126 1504760
SKJ 0 32160.481 83397.70 0 2336 9714.3 31040 3999830

Random dataframe based on ICCAT’s statistics (mean and sd)

Create empty dataframe with one row for each geographic cell and create columns of random catches

#extract only cells
df.cells <- df %>% mutate(cellID = group_indices(., xLon, yLat), cellID.hor = group_indices(., yLat, xLon)) %>% select(cellID, xLon, yLat, TimePeriodID, cellID.hor) %>% unique()

#try out generating complete random numbers with same statistics as species
#use truncnorm function to generate only positive numbers (set a=0, lower bound)
set.seed(10000)
df.random <- df.cells %>% mutate(
  rBET = rtruncnorm(a = 0, n = nrow(df.cells), mean = ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="BET"], sd = ICCATsummary$numeric.sd[ICCATsummary$skim_variable=="BET"]), 
  rYFT = rtruncnorm(a = 0, n = nrow(df.cells), mean = ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="YFT"], sd = ICCATsummary$numeric.sd[ICCATsummary$skim_variable=="YFT"]), 
  rSKJ = rtruncnorm(a = 0, n = nrow(df.cells), mean = ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="SKJ"], sd = ICCATsummary$numeric.sd[ICCATsummary$skim_variable=="SKJ"])) %>% rowid_to_column(var = "obsID")

#show statistics
summary.random <- skim(df.random[,7:9]) %>% select(-skim_type,-numeric.hist,-n_missing, -complete_rate)
knitr::kable(summary.random)
skim_variable numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100
rBET 12497.56 8598.012 0.1147307 5556.536 11168.13 18025.33 48411.49
rYFT 19895.27 14124.290 0.3026360 8725.676 17531.04 28343.12 87632.61
rSKJ 79565.80 56601.152 20.3601641 34781.825 69121.38 113543.76 415340.62

6 times 2 months BET high abundance clusters

Create dataset

set.seed(1000)
df.mixed <- df.random %>% 
  #hotspot with 50% more than mean catches in zone
  mutate(ab.1 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:2) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (3:12) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rBET = case_when(ab.1 == "high" ~ rBET + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="BET"]/2), 
                          ab.1 == "low" ~ rBET)) %>%
  
  #hotspot with 100% more than mean catches in zone
  mutate(ab.2 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (3:4) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (5:12) ~ "low",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:2) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rBET = case_when(ab.2 == "high" ~ rBET + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="BET"]), 
                          ab.2 == "low" ~ rBET)) %>%
  
  #hotspot with 1.5 times more than mean catches in zone
  mutate(ab.3 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (5:6) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (7:12) ~ "low",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:4) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rBET = case_when(ab.3 == "high" ~ rBET + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="BET"]*1.5), 
                          ab.3 == "low" ~ rBET)) %>%
  
  #hotspot with twice more than mean catches in zone
   mutate(ab.4 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (7:8) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (9:12) ~ "low",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:6) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rBET = case_when(ab.4 == "high" ~ rBET + ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="BET"]*2, 
                          ab.4 == "low" ~ rBET)) %>%
  
  #hotspot with 2.5 times more than mean catches in zone
   mutate(ab.5 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (9:10) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (11:12) ~ "low",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:8) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rBET = case_when(ab.5 == "high" ~ rBET + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="BET"]*2.5), 
                          ab.5 == "low" ~ rBET)) %>%
  
  #hotspot with 3 times more than mean catches in zone
   mutate(ab.6 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (11:12) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:10) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rBET = case_when(ab.6 == "high" ~ rBET + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="BET"]*3), 
                          ab.6 == "low" ~ rBET))

#print out summary table of hotspots
BET.hotspot1 <- df.mixed %>% filter(TimePeriodID %in% c(1:2)) %>%
  mutate(hotspot = case_when(ab.1 == "high" ~ "1 in", ab.1 == "low" ~ "1 out")) %>%
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>%
  mutate(perc.BET = (rBET / (rBET + rYFT + rSKJ))*100)

BET.hotspot2 <- df.mixed %>% filter(TimePeriodID %in% c(3:4)) %>%
  mutate(hotspot = case_when(ab.2 == "high" ~ "2 in", ab.2 == "low" ~ "2 out")) %>%
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>%
  mutate(perc.BET = (rBET / (rBET + rYFT + rSKJ))*100)

BET.hotspot3 <- df.mixed %>% filter(TimePeriodID %in% c(5:6)) %>%
  mutate(hotspot = case_when(ab.3 == "high" ~ "3 in", ab.3 == "low" ~ "3 out")) %>%
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>%
  mutate(perc.BET = (rBET / (rBET + rYFT + rSKJ))*100)

BET.hotspot4 <- df.mixed %>% filter(TimePeriodID %in% c(7:8)) %>%
  mutate(hotspot = case_when(ab.4 == "high" ~ "4 in", ab.4 == "low" ~ "4 out")) %>%
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>%
  mutate(perc.BET = (rBET / (rBET + rYFT + rSKJ))*100)

BET.hotspot5 <- df.mixed %>% filter(TimePeriodID %in% c(9:10)) %>%
  mutate(hotspot = case_when(ab.5 == "high" ~ "5 in", ab.5 == "low" ~ "5 out")) %>%
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>%
  mutate(perc.BET = (rBET / (rBET + rYFT + rSKJ))*100)

BET.hotspot6 <- df.mixed %>% filter(TimePeriodID %in% c(11:12)) %>%
  mutate(hotspot = case_when(ab.6 == "high" ~ "6 in", ab.6 == "low" ~ "6 out")) %>%
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>%
  mutate(perc.BET = (rBET / (rBET + rYFT + rSKJ))*100)

BET.all.hotspots <- full_join(BET.hotspot1, BET.hotspot2) %>% full_join(BET.hotspot3) %>% full_join(BET.hotspot4) %>% full_join(BET.hotspot5) %>% full_join(BET.hotspot6) %>% mutate(perc.BET = round(perc.BET, 2)) %>% select(hotspot, perc.BET)

knitr::kable(BET.all.hotspots)
hotspot perc.BET
1 in 13.01
1 out 11.28
2 in 15.34
2 out 11.47
3 in 16.58
3 out 11.50
4 in 19.61
4 out 11.02
5 in 22.47
5 out 10.84
6 in 23.40
6 out 11.04

Plot hotspots

SOM analysis

SOM parameters

#get data in matrix shape
data <- df.mixed[,7:9]
data <- data.matrix(data)
data <- scale(data, center = T, scale = T)

#calculate grid parameters
grid.size <- ceiling((nrow(data) ^ (1/2))*5)
svd.data <- svd(data)
svd1 <- svd.data$d[1]
svd2 <- svd.data$d[2]
y <- round(sqrt(grid.size*svd2/svd1))
x <- round(svd1/svd2*y)

SOM training and output visualisation

#fit som
set.seed(10)
som <- som(data, grid = somgrid(x,y, "hexagonal"), rlen = 10000)

Clustering using hclust + dynamicTreeCut and descriptive statistics, Ward

#distance matrix between the cells
dc <- dist(getCodes(som))

#Dendrogram
dendrogram <- hclust(dc,method="ward.D2")
plot(dendrogram,hang=-1,labels=F)

#DynamicTreeCut
dc <- as.matrix(dc)
treecut <- cutreeHybrid(dendrogram, distM = dc)
##  ..cutHeight not given, setting it to 25.9  ===>  99% of the (truncated) height range in dendro.
##  ..done.
{plot(som, type = "codes", codeRendering = "", shape = "straight", bgcol = Clusters.col[treecut$labels])
add.cluster.boundaries(som, treecut$labels)}

#also view heat maps
for(j in 1:ncol(data)){{plot(som, type = "property", property = getCodes(som)[,j], main=colnames(getCodes(som))[j], palette.name=coolBlueHotRed, shape = "straight")
  add.cluster.boundaries(som, treecut$labels)}}

#incorporate cluster assignation to dataset
clusters <- as_tibble(treecut$labels) %>% rowid_to_column(var = "node") %>% mutate(cluster.Ward = value) %>% select(-value)
nodes <- as_tibble(som$unit.classif) %>% rowid_to_column(var = "obsID") %>% mutate(node = value) %>% select(-value)
clusters <- inner_join(nodes, clusters, by = "node")
df.mixed <- df.mixed %>% inner_join(clusters, by = "obsID")

### Extract clusters of interest using distances between clusters’ medians

df.medians <- df.mixed %>% group_by(cluster.Ward) %>% summarise(rBET = median(rBET), rYFT = median(rYFT), rSKJ = median(rSKJ))

#For vulnerable tuna species: BET and YFT
##for each species, sort by median and select clusters above highest distance
###BET
BET.medians <- df.medians %>% select(cluster.Ward, rBET) %>% 
  arrange(desc(rBET)) %>% rownames_to_column("rank") %>% mutate(rank = as.integer(rank)) %>%
  mutate(distance = ave(rBET, FUN = function(x) -c(0,diff(x))))

####select all clusters above max(distance)
limit.rank <- BET.medians[BET.medians$distance == max(BET.medians$distance),] %>% select(rank) %>% unlist() %>% as.double()
BET.medians <- BET.medians %>% filter(rank < limit.rank) %>% select(cluster.Ward) %>% as.matrix()

##only BET and YFT for first 3 months
##bind two vectors 
df.visual <- df.mixed %>% filter(cluster.Ward %in% BET.medians)
for(j in 1:12){
  print(ggplot() + geom_sf(data = world) + geom_point(data = df.visual[df.visual$TimePeriodID == j,], aes(x = xLon, y = yLat, color = as.factor(cluster.Ward)), pch = 15) + 
          scale_colour_manual(breaks = c(1:16), values = Clusters.col, labels = c(1:16)) + 
          theme_bw() + labs(title = "High BET abundance clusters", caption = "clustering: hclust + cuTreeHybrid, Ward", subtitle = paste("Month", j)) + coord_sf(xlim = c(-40,20), ylim = c(-25,25), expand = FALSE)+ guides(colour = guide_legend(title ="Clusters")))}

Create a vulnerability index by cluster

library(scales)
## For tuna species only
###set weight factors for each species
fac.BET <- 10 #set factors as to make BET stand out but still include YFT
fac.YFT <- 5
fac.SKJ <- 1

#take again this df.median
df.medians <- df.mixed %>% group_by(cluster.Ward) %>% summarise(rBET = median(rBET), rYFT = median(rYFT), rSKJ = median(rSKJ)) %>% #create tuna vulnerability index by column
  mutate(vulnerability.tuna = round((fac.BET*rBET + fac.YFT*rYFT + fac.SKJ *rSKJ)/((fac.BET+fac.YFT+fac.SKJ)*1000))) %>%
  mutate(vulnerability.tuna = round(rescale(vulnerability.tuna, c(1,10)))) %>% #rescale to have values between 1 and 10
  arrange(desc(vulnerability.tuna)) %>%#arrange clusters by vulnerability
  select(cluster.Ward, vulnerability.tuna) #select only 2 columns of interest

#bind this vulnerability value to complete dataframe for plotting catches on map
df.mixed <- df.mixed %>% inner_join(df.medians, by = "cluster.Ward")

#replot boxplots with vulnerability colors
##BET
ggplot(data=df.mixed, aes(group = cluster.Ward, y=rBET)) + geom_boxplot(aes(fill = as.factor(vulnerability.tuna))) + scale_fill_manual(breaks = c(1:10), values = rev(Vulnerability.col), labels = c(1:10)) + labs(title = "BET by cluster", subtitle = "Ward") + theme_minimal() +
  theme(axis.text = element_text(size=9), axis.title = element_text(size = 10), 
        legend.text = element_text(size=10), legend.title = element_text(size=10), 
        legend.key.size = unit(0.4, "cm"), plot.title = element_text(size=12), legend.position = "bottom",
        plot.caption = element_text(hjust = 0, size = 9, face = "italic"))+
  guides(fill = guide_legend(title ="Vulnerability"))

##YFT
ggplot(data=df.mixed, aes(group = cluster.Ward, y=rYFT)) + geom_boxplot(aes(fill = as.factor(vulnerability.tuna))) + scale_fill_manual(breaks = c(1:10), values = rev(Vulnerability.col), labels = c(1:10)) + labs(title = "YFT by cluster", subtitle = "Ward") + theme_minimal() +
  theme(axis.text = element_text(size=9), axis.title = element_text(size = 10), 
        legend.text = element_text(size=10), legend.title = element_text(size=10), 
        legend.key.size = unit(0.4, "cm"), plot.title = element_text(size=12), legend.position = "bottom",
        plot.caption = element_text(hjust = 0, size = 9, face = "italic"))+
  guides(fill = guide_legend(title ="Vulnerability"))


#plot maps with color based on vulnerability in r
for(j in 1:12){
  print(ggplot() + geom_sf(data = world) + geom_point(data = df.mixed[df.mixed$TimePeriodID == j,], aes(x = xLon, y = yLat, color = as.factor(vulnerability.tuna)), pch = 15) + scale_colour_manual(values = rev(Vulnerability.col[1:10]), breaks = c(1:10), labels = (1:10)) + theme_bw() + labs(title = "Simulated dataset, vulnerability of tuna species", subtitle = paste("Month", j)) + coord_sf(xlim = c(-40,20), ylim = c(-25,25), expand = FALSE)+ guides(colour = guide_legend(title ="Vulnerability", subtitle = "Tuna species")))}

6 times 2 months YFT high abundance clusters

Create dataset

set.seed(1000)
df.mixed <- df.random %>% 
  #hotspot with 50% more than mean catches in zone
  mutate(ab.1 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:2) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (3:12) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rYFT = case_when(ab.1 == "high" ~ rYFT + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="YFT"]/2), 
                          ab.1 == "low" ~ rYFT)) %>%
  
  #hotspot with 100% more than mean catches in zone
  mutate(ab.2 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (3:4) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (5:12) ~ "low",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:2) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rYFT = case_when(ab.2 == "high" ~ rYFT + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="YFT"]), 
                          ab.2 == "low" ~ rYFT)) %>%
  
  #hotspot with 1.5 times more than mean catches in zone
  mutate(ab.3 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (5:6) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (7:12) ~ "low",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:4) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rYFT = case_when(ab.3 == "high" ~ rYFT + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="YFT"]*1.5), 
                          ab.3 == "low" ~ rYFT)) %>%
  
  #hotspot with twice more than mean catches in zone
   mutate(ab.4 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (7:8) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (9:12) ~ "low",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:6) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rYFT = case_when(ab.4 == "high" ~ rYFT + ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="YFT"]*2, 
                          ab.4 == "low" ~ rYFT)) %>%
  
  #hotspot with 2.5 times more than mean catches in zone
   mutate(ab.5 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (9:10) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (11:12) ~ "low",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:8) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rYFT = case_when(ab.5 == "high" ~ rYFT + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="YFT"]*2.5), 
                          ab.5 == "low" ~ rYFT)) %>%
  
  #hotspot with 3 times more than mean catches in zone
   mutate(ab.6 = case_when(
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (11:12) ~ "high",
    cellID %in% (400:700) & cellID.hor %in% (400:700) & TimePeriodID %in% (1:10) ~ "low",
    cellID < 400 & cellID.hor < 400 ~ "low", 
    cellID < 400 & cellID.hor > 700 ~ "low", 
    cellID > 700 & cellID.hor < 400 ~ "low", 
    cellID > 700 & cellID.hor > 700 ~ "low", 
    cellID < 400 ~ "low", cellID > 700 ~"low", 
    cellID.hor < 400 ~ "low", cellID.hor > 700 ~ "low")) %>% 
  mutate(rYFT = case_when(ab.6 == "high" ~ rYFT + (ICCATsummary$numeric.mean[ICCATsummary$skim_variable=="YFT"]*3), 
                          ab.6 == "low" ~ rYFT))

#print out summary table of hotspots
YFT.hotspot1 <- df.mixed %>% filter(TimePeriodID %in% c(1:2)) %>% 
  mutate(hotspot = case_when(ab.1 == "high" ~ "1 in", ab.1 == "low" ~ "1 out")) %>% 
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>% 
  mutate(perc.YFT = (rYFT / (rBET + rYFT + rSKJ))*100)

YFT.hotspot2 <- df.mixed %>% filter(TimePeriodID %in% c(3:4)) %>% 
  mutate(hotspot = case_when(ab.2 == "high" ~ "2 in", ab.2 == "low" ~ "2 out")) %>% 
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>% 
  mutate(perc.YFT = (rYFT / (rBET + rYFT + rSKJ))*100)

YFT.hotspot3 <- df.mixed %>% filter(TimePeriodID %in% c(5:6)) %>% 
  mutate(hotspot = case_when(ab.3 == "high" ~ "3 in", ab.3 == "low" ~ "3 out")) %>% 
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>% 
  mutate(perc.YFT = (rYFT / (rBET + rYFT + rSKJ))*100)

YFT.hotspot4 <- df.mixed %>% filter(TimePeriodID %in% c(7:8)) %>% 
  mutate(hotspot = case_when(ab.4 == "high" ~ "4 in", ab.4 == "low" ~ "4 out")) %>% 
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>% 
  mutate(perc.YFT = (rYFT / (rBET + rYFT + rSKJ))*100)

YFT.hotspot5 <- df.mixed %>% filter(TimePeriodID %in% c(9:10)) %>% 
  mutate(hotspot = case_when(ab.5 == "high" ~ "5 in", ab.5 == "low" ~ "5 out")) %>% 
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>%
  mutate(perc.YFT = (rYFT / (rBET + rYFT + rSKJ))*100)

YFT.hotspot6 <- df.mixed %>% filter(TimePeriodID %in% c(11:12)) %>%
  mutate(hotspot = case_when(ab.6 == "high" ~ "6 in", ab.6 == "low" ~ "6 out")) %>%
  group_by(hotspot) %>% summarise(rBET = sum(rBET), rYFT = sum(rYFT), rSKJ = sum(rSKJ)) %>%
  mutate(perc.YFT = (rYFT / (rBET + rYFT + rSKJ))*100)

YFT.all.hotspots <- full_join(YFT.hotspot1, YFT.hotspot2) %>% full_join(YFT.hotspot3) %>% full_join(YFT.hotspot4) %>%
  full_join(YFT.hotspot5) %>% full_join(YFT.hotspot6) %>% 
  mutate(perc.YFT = round(perc.YFT, 2)) %>% select(hotspot, perc.YFT)

knitr::kable(YFT.all.hotspots)
hotspot perc.YFT
1 in 19.79
1 out 17.58
2 in 23.58
2 out 18.01
3 in 25.67
3 out 18.16
4 in 27.96
4 out 16.99
5 in 30.84
5 out 17.58
6 in 32.62
6 out 18.07

Plot hotspots

SOM analysis

SOM parameters

#get data in matrix shape
data <- df.mixed[,7:9]
data <- data.matrix(data)
data <- scale(data, center = T, scale = T)

#calculate grid parameters
grid.size <- ceiling((nrow(data) ^ (1/2))*5)
svd.data <- svd(data)
svd1 <- svd.data$d[1]
svd2 <- svd.data$d[2]
y <- round(sqrt(grid.size*svd2/svd1))
x <- round(svd1/svd2*y)

SOM training and output visualisation

#fit som
set.seed(10)
som <- som(data, grid = somgrid(x,y, "hexagonal"), rlen = 10000)

Clustering using hclust + dynamicTreeCut and descriptive statistics, Ward

##  ..cutHeight not given, setting it to 24  ===>  99% of the (truncated) height range in dendro.
##  ..done.

Extract clusters of interest using distances between clusters’ medians

df.medians <- df.mixed %>% group_by(cluster.Ward) %>% summarise(rBET = median(rBET), rYFT = median(rYFT), rSKJ = median(rSKJ))

#sort by median
YFT.medians <- df.medians %>% select(cluster.Ward, rYFT) %>% 
  arrange(desc(rYFT)) %>% rownames_to_column("rank") %>% mutate(rank = as.integer(rank)) %>%
  mutate(distance = ave(rYFT, FUN = function(x) -c(0,diff(x))))

####select all clusters above max(distance)
limit.rank <- YFT.medians[YFT.medians$distance == max(YFT.medians$distance),] %>% select(rank) %>% unlist() %>% as.double()
YFT.medians <- YFT.medians %>% filter(rank < limit.rank) %>% select(cluster.Ward) %>% as.matrix()

 
df.visual <- df.mixed %>% filter(cluster.Ward %in% YFT.medians)
for(j in 1:12){
  print(ggplot() + geom_sf(data = world) + geom_point(data = df.visual[df.visual$TimePeriodID == j,], aes(x = xLon, y = yLat, color = as.factor(cluster.Ward)), pch = 15) + 
          scale_colour_manual(breaks = c(1:16), values = Clusters.col, labels = c(1:16)) + 
          theme_bw() + labs(title = "High YFT abundance clusters", caption = "clustering: hclust + cuTreeHybrid, Ward", subtitle = paste("Month", j)) + coord_sf(xlim = c(-40,20), ylim = c(-25,25), expand = FALSE)+ guides(colour = guide_legend(title ="Clusters")))}

Create a vulnerability index by cluster

library(scales)
## For tuna species only
###set weight factors for each species
fac.BET <- 5 
fac.YFT <- 10 #set factors as to make YFT stand out but still include BET in indice
fac.SKJ <- 1

#take again this df.median
df.medians <- df.mixed %>% group_by(cluster.Ward) %>% summarise(rBET = median(rBET), rYFT = median(rYFT), rSKJ = median(rSKJ)) %>% #create tuna vulnerability index by column
  mutate(vulnerability.tuna = round((fac.BET*rBET + fac.YFT*rYFT + fac.SKJ *rSKJ)/((fac.BET+fac.YFT+fac.SKJ)*1000))) %>%
  mutate(vulnerability.tuna = round(rescale(vulnerability.tuna, c(1,10)))) %>% #rescale to have values between 1 and 10
  arrange(desc(vulnerability.tuna)) %>%#arrange clusters by vulnerability
  select(cluster.Ward, vulnerability.tuna) #select only 2 columns of interest

#bind this vulnerability value to complete dataframe for plotting catches on map
df.mixed <- df.mixed %>% inner_join(df.medians, by = "cluster.Ward")

#replot boxplots with vulnerability colors
##BET
ggplot(data=df.mixed, aes(group = cluster.Ward, y=rBET)) + geom_boxplot(aes(fill = as.factor(vulnerability.tuna))) + scale_fill_manual(breaks = c(1:10), values = rev(Vulnerability.col), labels = c(1:10)) + labs(title = "BET by cluster", subtitle = "Ward") + theme_minimal() +
  theme(axis.text = element_text(size=9), axis.title = element_text(size = 10), 
        legend.text = element_text(size=10), legend.title = element_text(size=10), 
        legend.key.size = unit(0.4, "cm"), plot.title = element_text(size=12), legend.position = "bottom",
        plot.caption = element_text(hjust = 0, size = 9, face = "italic"))+
  guides(fill = guide_legend(title ="Vulnerability"))

##YFT
ggplot(data=df.mixed, aes(group = cluster.Ward, y=rYFT)) + geom_boxplot(aes(fill = as.factor(vulnerability.tuna))) + scale_fill_manual(breaks = c(1:10), values = rev(Vulnerability.col), labels = c(1:10)) + labs(title = "YFT by cluster", subtitle = "Ward") + theme_minimal() +
  theme(axis.text = element_text(size=9), axis.title = element_text(size = 10), 
        legend.text = element_text(size=10), legend.title = element_text(size=10), 
        legend.key.size = unit(0.4, "cm"), plot.title = element_text(size=12), legend.position = "bottom",
        plot.caption = element_text(hjust = 0, size = 9, face = "italic"))+
  guides(fill = guide_legend(title ="Vulnerability"))


#plot maps with color based on vulnerability in r
for(j in 1:12){
  print(ggplot() + geom_sf(data = world) + geom_point(data = df.mixed[df.mixed$TimePeriodID == j,], aes(x = xLon, y = yLat, color = as.factor(vulnerability.tuna)), pch = 15) + scale_colour_manual(values = rev(Vulnerability.col[1:10]), breaks = c(1:10), labels = (1:10)) + theme_bw() + labs(title = "Simulated dataset, vulnerability of tuna species", subtitle = paste("Month", j)) + coord_sf(xlim = c(-40,20), ylim = c(-25,25), expand = FALSE)+ guides(colour = guide_legend(title ="Vulnerability", subtitle = "Tuna species")))}